Explore Hotel aspects and Predict the rating of each review.
Hotels play a crucial role in traveling and with the increased access to information new pathways of selecting the best ones emerged. With this dataset, consisting of 20k reviews crawled from Tripadvisor, you can explore what makes a great hotel and maybe even use this model in your travels!
This Case Study is taken from : https://www.kaggle.com/andrewmvd/trip-advisor-hotel-reviews
Citation Alam, M. H., Ryu, W.-J., Lee, S., 2016. Joint multi-grain topic sentiment: modeling semantic aspects for online reviews. Information Sciences 339, 206–223. DOI https://zenodo.org/record/1219899#.X64_imgzbIU
Dataset link: https://www.kaggle.com/andrewmvd/trip-advisor-hotel-reviews
The problem given is to predict whether the "review" given is either good or bad by using "rating" as the class label. How well the model predicts is to be judged by the given metrics, i.e. MAE and RMSE.
We can generally understand that if the rating is 3 and above, it is good or else it is bad.
In this task, we have to predict the rating based on the review, which should be done automatically by our model; that rating in turn decides whether the review is good or bad. As we have a 1-5 rating range for every review, we have to predict the rating in that range only, by classification.
This means the problem is a "Multi-Class Classification" problem.
We can use classification ML techniques like SVM, Decision Trees, Random Forest, Boosting techniques etc., and Deep Learning models too using certain type of loss and activation function.
Finally, based on the metrics, we have to decide which model to use. The metrics we use are the F1-Score, along with the RMSE and MAE specified in the problem statement.
I've covered 3 models — Linear SVM, LGBMClassifier and a Deep Learning model — and analysed the results.
# Importing the packages
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer
import re
# Tutorial about Python regular expressions: https://pymotw.com/2/re/
import string
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle
from tqdm import tqdm
import os
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from plotly import graph_objs as go
import plotly.figure_factory as ff
import seaborn as sns
from bs4 import BeautifulSoup
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
# Load the TripAdvisor reviews dataset (columns: Review, Rating).
df = pd.read_csv("tripadvisor_hotel_reviews.csv")
df.head()
df.shape
df.dtypes
# Rows containing any missing value (sanity check).
df[df.isnull().any(axis=1)]
df.isnull().sum()
# Class-label (Rating, 1-5) distribution.
df['Rating'].hist()
df['Rating'].value_counts()
class_dist = df['Rating'].value_counts()
def distribution_plot(x, y, name):
    """Render a plotly bar chart of class frequencies.

    x: class labels, y: per-class counts, name: chart title.
    """
    fig = go.Figure([
        go.Bar(x=x, y=y)
    ])
    fig.update_layout(title_text=name)
    fig.show()

# Backward-compatible alias for the original (misspelled) name, in case
# any other cell still calls it.
ditribution_plot = distribution_plot

distribution_plot(x=class_dist.index, y=class_dist.values, name='Class Distribution')
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF over the raw reviews; only the learned vocabulary and its IDF
# weights are needed here, so the transformed matrix is discarded.
vectorizer = TfidfVectorizer()
vectorizer.fit_transform(df['Review'])

# get_feature_names() was removed in scikit-learn 1.2; use the new API and
# fall back for older versions.
try:
    word_list = vectorizer.get_feature_names_out()
except AttributeError:
    word_list = vectorizer.get_feature_names()

idf2 = vectorizer.idf_
# argsort descending: the highest-IDF terms are the rarest in the corpus.
features = np.argsort(idf2)[::-1]
words2 = [word_list[i] for i in features[0:60]]
print(words2)
# Visualise the selected terms as a word cloud.
from wordcloud import WordCloud

cloud = WordCloud(width=1200, height=1000).generate(" ".join(words2))
plt.figure(figsize=(20, 15))
plt.imshow(cloud)
plt.axis("off")
plt.show()
import re

# Ordered (pattern, replacement) rules.  The specific contractions come
# first so e.g. "won't" is expanded whole rather than hitting the generic
# "n't" rule and becoming "wo not".
_CONTRACTION_RULES = [
    (r"won't", "will not"),
    (r"can\'t", "can not"),
    (r"n\'t", " not"),
    (r"\'re", " are"),
    (r"\'s", " is"),
    (r"\'d", " would"),
    (r"\'ll", " will"),
    (r"\'t", " not"),
    (r"\'ve", " have"),
    (r"\'m", " am"),
]

def decontracted(phrase):
    """Expand common English contractions in *phrase*.

    Note: "'s" is always rewritten to " is", so possessives are expanded
    too — the same contract as the original rule set.
    """
    for pattern, replacement in _CONTRACTION_RULES:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase
import nltk
nltk.download('stopwords')
# Import under a distinct alias and store the set in stop_words so the
# nltk.corpus.stopwords *module* is not shadowed (the original rebound the
# name `stopwords` to a set).
from nltk.corpus import stopwords as nltk_stopwords
stop_words = set(nltk_stopwords.words('english'))

from tqdm import tqdm

# Clean each review: strip URLs, HTML markup, contractions, tokens that
# contain digits, and all non-letters; then lowercase and drop stopwords.
preprocessed_reviews = []
for sentance in tqdm(df['Review'].values):
    sentance = re.sub(r"http\S+", "", sentance)
    sentance = BeautifulSoup(sentance, 'lxml').get_text()
    sentance = decontracted(sentance)
    # Raw strings so Python does not interpret the backslash escapes.
    sentance = re.sub(r"\S*\d\S*", "", sentance).strip()
    sentance = re.sub(r'[^A-Za-z]+', ' ', sentance)
    # https://gist.github.com/sebleier/554280
    sentance = ' '.join(e.lower() for e in sentance.split() if e.lower() not in stop_words)
    preprocessed_reviews.append(sentance.strip())

# Spot-check one cleaned review.
preprocessed_reviews[232]
# Encode ratings 1-5 -> 0-4 so downstream models/metrics see contiguous
# integer classes.
le = LabelEncoder()
Y = le.fit_transform(df[r'Rating'])
X = np.array(preprocessed_reviews)

# Fixed random_state makes the splits reproducible (consistent with the
# later deep-learning split, which already uses random_state=42).
X_train, X_cv, y_train, y_cv = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
print(X_train.shape)
print(X_cv.shape)
print(X_test.shape)

# Bag-of-words over unigrams + bigrams, fitted on the train split only so
# no vocabulary leaks in from cv/test.
count_vect = CountVectorizer(lowercase=True, ngram_range=(1, 2))
X_train = count_vect.fit_transform(X_train)
X_cv = count_vect.transform(X_cv)
X_test = count_vect.transform(X_test)

# with_mean=False keeps the matrices sparse (centering would densify them).
scalar = StandardScaler(with_mean=False)
X_train_bow = scalar.fit_transform(X_train)
X_test_bow = scalar.transform(X_test)
X_cv_bow = scalar.transform(X_cv)
# Hyperparameter search for the linear SVM.  SGDClassifier with hinge loss
# is a linear SVM trained by SGD; it is wrapped in CalibratedClassifierCV
# to get probability estimates, so the alpha is tuned through the
# 'base_estimator__' prefix.
# (A dead, commented-out manual alpha loop was removed here — it was
# superseded by this GridSearchCV and also scored on the unscaled X_cv
# while fitting on X_train_bow.)
# https://stackoverflow.com/questions/55893734/how-can-i-use-sgdclassifier-hinge-loss-with-gridsearchcv-using-log-loss-metric
# NOTE(review): 'base_estimator' was renamed to 'estimator' in
# scikit-learn 1.2 — adjust the two lines below on newer versions.
grid_params = {'base_estimator__alpha': [10**-3, 10**-2, 10**-1, 1, 10, 10**2, 10**3]}
clf = SGDClassifier(loss='hinge')
calibrated_clf = CalibratedClassifierCV(base_estimator=clf, method='sigmoid', cv=3)
svm_model = GridSearchCV(calibrated_clf, param_grid=grid_params, cv=5)
svm_model.fit(X_train_bow, y_train)
print(svm_model.best_params_)
# Refit the final linear SVM with the alpha chosen by the grid search.
# (The original hard-coded alpha=1, silently ignoring the tuned value
# printed just above.)
best_alpha = svm_model.best_params_['base_estimator__alpha']
clf = SGDClassifier(alpha=best_alpha, fit_intercept=True,
                    learning_rate='optimal', loss='hinge',
                    verbose=0, warm_start=False)
clf.fit(X_train_bow, y_train)
tr_pred = clf.predict(X_train_bow)
cv_pred = clf.predict(X_cv_bow)
te_pred = clf.predict(X_test_bow)
print(confusion_matrix(y_cv, cv_pred))
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Confusion matrix for the SVM on the train split.
# Use a dedicated variable for the matrix frame: the original assigned it
# to `df`, clobbering the reviews DataFrame that is read again further
# down the notebook.
conf_mat = confusion_matrix(y_train, tr_pred)
class_label = ["1", "2", "3", "4", "5"]
cm_df = pd.DataFrame(conf_mat, index=class_label, columns=class_label)
sns.heatmap(cm_df, annot=True, fmt="d")
plt.title("Confusion Matrix for train data")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Confusion matrix for the SVM on the cv split.
# cm_df instead of df: reassigning `df` would clobber the reviews
# DataFrame still needed later in the notebook.
conf_mat = confusion_matrix(y_cv, cv_pred)
class_label = ["1", "2", "3", "4", "5"]
cm_df = pd.DataFrame(conf_mat, index=class_label, columns=class_label)
sns.heatmap(cm_df, annot=True, fmt="d")
plt.title("Confusion Matrix for cv data")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Confusion matrix for the SVM on the test split.
# cm_df instead of df: reassigning `df` would clobber the reviews
# DataFrame still needed later in the notebook.
conf_mat = confusion_matrix(y_test, te_pred)
class_label = ["1", "2", "3", "4", "5"]
cm_df = pd.DataFrame(conf_mat, index=class_label, columns=class_label)
sns.heatmap(cm_df, annot=True, fmt="d")
plt.title("Confusion Matrix for test data")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt  # sqrt was used below without ever being imported

# Micro-averaged F1 over the 5 classes for each split.
print("Train ", f1_score(y_train, tr_pred, average='micro'))
print("cv ", f1_score(y_cv, cv_pred, average='micro'))
print("Test ", f1_score(y_test, te_pred, average='micro'))

# RMSE / MAE on the encoded labels (0-4); identical to the 1-5 scale since
# the encoding is a constant shift.
rms = sqrt(mean_squared_error(y_test, te_pred))
print(rms)
mae = mean_absolute_error(y_test, te_pred)
print(mae)
from lightgbm import LGBMRegressor,LGBMClassifier
from sklearn.model_selection import GridSearchCV
# Search space for the LightGBM classifier.
grid={
'max_depth': [30,40,45],
'n_estimators': [4000,4500],
'learning_rate':[0.1,0.2]}
%%time
# NOTE(review): %%time above is a Jupyter cell magic — this file is a
# flattened notebook, so that line is not valid plain-Python syntax.
# Fixed (non-searched) params; the grid tunes depth/estimators/lr with
# 3-fold CV.
clf= LGBMClassifier(colsample_bytree=0.8,subsample=0.9,min_child_samples=50,num_leaves=20)
rf_random = GridSearchCV(estimator = clf, param_grid = grid,
cv=3, verbose=1)
rf_random.fit(X_train_bow,y_train,verbose=True)
bestpar=rf_random.best_params_
bestpar
# Final LightGBM model and predictions on all three splits.
# NOTE(review): num_leaves=10 and max_depth=20 do not match any value
# searched above (max_depth candidates were 30/40/45) — presumably taken
# from a separate tuning run; confirm.
clf=LGBMClassifier(num_leaves=10,colsample_bytree=0.8,subsample=0.9,min_child_samples=50,
learning_rate= 0.1, max_depth= 20, n_estimators= 4000)
clf.fit(X_train_bow,y_train)
predt=clf.predict(X_train_bow)
predcv=clf.predict(X_cv_bow)
predte = clf.predict(X_test_bow)
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Confusion matrix for LightGBM on the train split.
# cm_df instead of df: reassigning `df` would clobber the reviews
# DataFrame still needed later in the notebook.
conf_mat = confusion_matrix(y_train, predt)
class_label = ["1", "2", "3", "4", "5"]
cm_df = pd.DataFrame(conf_mat, index=class_label, columns=class_label)
sns.heatmap(cm_df, annot=True, fmt="d")
# Fixed title: this plot shows *train* predictions (the original title
# said "cv data").
plt.title("Confusion Matrix for train data")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Confusion matrix for LightGBM on the cv split.
# cm_df instead of df: reassigning `df` would clobber the reviews
# DataFrame still needed later in the notebook.
conf_mat = confusion_matrix(y_cv, predcv)
class_label = ["1", "2", "3", "4", "5"]
cm_df = pd.DataFrame(conf_mat, index=class_label, columns=class_label)
sns.heatmap(cm_df, annot=True, fmt="d")
plt.title("Confusion Matrix for cv data")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Confusion matrix for LightGBM on the test split.
# cm_df instead of df: reassigning `df` would clobber the reviews
# DataFrame still needed later in the notebook.
conf_mat = confusion_matrix(y_test, predte)
class_label = ["1", "2", "3", "4", "5"]
cm_df = pd.DataFrame(conf_mat, index=class_label, columns=class_label)
sns.heatmap(cm_df, annot=True, fmt="d")
plt.title("Confusion Matrix for test data")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()
predte
# Micro-averaged F1 for each split.
print("The train f1-score is", f1_score(y_train, predt, average='micro'))
print("The cv f1-score is", f1_score(y_cv, predcv, average='micro'))
print("The test f1-score is", f1_score(y_test, predte, average='micro'))

from math import sqrt  # sqrt was used below without ever being imported
from sklearn.metrics import mean_squared_error, mean_absolute_error

# RMSE / MAE on the encoded labels for the test split.
rms = sqrt(mean_squared_error(y_test, predte))
print(rms)
mae = mean_absolute_error(y_test, predte)
print(mae)
import plotly.figure_factory as ff
import gc
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
import json
from tensorflow.keras.preprocessing import text, sequence
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from tensorflow.keras import layers
from keras.layers import Reshape,Concatenate
from tensorflow.keras.layers import Reshape
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding
from tensorflow.keras import regularizers
from tensorflow.keras.layers import LeakyReLU
# Peek at the first three training samples.
for i in X_train[:3]:
    print(i)

tokenizer = Tokenizer(lower=False, num_words=80000)
# Fit once on the whole cleaned corpus: fit_on_texts expects a list of
# texts.  (The original looped review-by-review with a loop variable
# named `text`, which also shadowed the keras `text` module imported
# above.)
tokenizer.fit_on_texts(preprocessed_reviews)

pickle.dump(tokenizer, open("tokenizertripadv.pickel", "wb"))
tokenizer = pickle.load(open("tokenizertripadv.pickel", "rb"))

# Longest review measured in *tokens*; the original used len(x) on the
# raw string — a character count — which massively over-pads sequences.
max_length = max(len(x.split()) for x in X)
vocab_size = len(tokenizer.word_index) + 1
exp_sen = 1
max_length
# Map ratings 1-5 -> 0-4 for sparse_categorical_crossentropy.
encoding = {1: 0,
            2: 1,
            3: 2,
            4: 3,
            5: 4
            }
#labels = ['1', '2', '3', '4', '5']
# The original read df['Rating'] here, but `df` was reassigned to a
# confusion-matrix DataFrame in the plotting cells above, so that lookup
# breaks.  Y (the LabelEncoder output computed earlier, before the
# clobbering) already holds exactly this 0-4 encoding, since LabelEncoder
# maps the sorted classes 1..5 to 0..4.
y = pd.Series(Y)

X_train, X_cv, y_train, y_cv = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2)
def compute_text(X_train, X_cv, X_test, tokenizer):
    """Convert each split to integer sequences and pad them to max_length.

    Returns the (train, cv, test) padded arrays, in that order.
    """
    def _encode(texts):
        # Words -> integer ids, then left-pad/truncate to a fixed length.
        seqs = tokenizer.texts_to_sequences(texts)
        return sequence.pad_sequences(seqs, maxlen=max_length)

    return _encode(X_train), _encode(X_cv), _encode(X_test)

train, cv, test = compute_text(X_train, X_cv, X_test, tokenizer)
# Load pre-trained GloVe vectors from a local pickle.
# NOTE(review): assumes the pickle is a dict mapping word -> 300-d vector;
# confirm against how 'glove_vectors' was produced.
with open('glove_vectors', 'rb') as f:
    glove=pickle.load(f)
glove_words=set(glove.keys())

# Build the embedding matrix: row j holds the GloVe vector of the word
# with tokenizer index j; words absent from GloVe keep an all-zero row.
embedd_matrix= np.zeros((len(tokenizer.word_index)+1,300))
for i,j in tokenizer.word_index.items():
    if i in glove_words:
        embed_vec=glove[i]
        embedd_matrix[j]=embed_vec
# Prints the shape of whichever vector was assigned last (raises
# NameError if no vocabulary word was found in GloVe).
print(embed_vec.shape,embedd_matrix.shape)
cv.shape
from tensorflow.keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate, Masking
from tensorflow.keras.layers import LSTM, Bidirectional, GlobalMaxPooling1D, Dropout
from tensorflow.keras.preprocessing import text, sequence
from tqdm import tqdm_notebook as tqdm
import tensorflow as tf
import tensorflow.keras
import pickle
import tensorflow.keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.models import Model
# He-normal initializer for the dense layers.
k = tf.keras.initializers.he_normal(seed=None)
train.shape[1:]

# --- Model: Embedding -> BiLSTM -> LeakyReLU -> GlobalMaxPool -> MLP -> softmax(5) ---
text_in = Input(shape=(None,), name='input1')
# NOTE(review): the pretrained GloVe weights are left trainable; pass
# trainable=False if freezing them was the intent.
t = Embedding(*embedd_matrix.shape, weights=[embedd_matrix])(text_in)
t = layers.Bidirectional(tf.keras.layers.LSTM(32, activation="tanh",
                                              recurrent_activation="sigmoid",
                                              return_sequences=True))(t)
t = tf.keras.layers.LeakyReLU(alpha=0.3)(t)
t = tensorflow.keras.layers.GlobalMaxPooling1D()(t)

# The first two Dense layers are linear because a LeakyReLU follows each
# one.  The original used activation='relu' here, which zeroes the
# negative inputs first and turns the following LeakyReLU into a no-op.
hidden = Dense(100, kernel_initializer=k)(t)
hidden = tf.keras.layers.LeakyReLU(alpha=0.3)(hidden)
hidden = Dropout(0.5)(hidden)
hidden = Dense(96, kernel_initializer=k)(hidden)
hidden = tf.keras.layers.LeakyReLU(alpha=0.3)(hidden)
hidden = Dropout(0.5)(hidden)
hidden = Dense(100, activation='relu', kernel_initializer=k)(hidden)
out1 = Dense(5, activation='softmax', name='out1')(hidden)

model = Model(inputs=[text_in], outputs=[out1])
# NOTE(review): learning_rate=0.03 is very high for Adam (default 1e-3) —
# likely a contributor to the slow/unstable training mentioned below.
model.compile(loss="sparse_categorical_crossentropy",
              optimizer=tf.keras.optimizers.Adam(learning_rate=0.03),
              metrics=['accuracy'])
model.summary()

EPOCHS = 3
BATCH_SIZE = 100
history = model.fit(train, y_train, epochs=EPOCHS, validation_split=0.25,
                    batch_size=BATCH_SIZE, verbose=2)
The training is taking more time than expected, so I'm skipping it for now.
from prettytable import PrettyTable

# Summary of test metrics per model.
# Column order fixed: the values were entered RMSE-first (matching the
# rms-then-mae print order in the metric cells above), but the original
# header said MAE then RMSE — which would make every RMSE smaller than
# its MAE, impossible since RMSE >= MAE always.
x = PrettyTable()
x.field_names = ['Model', 'Test F1-Score', 'RMSE', 'MAE']
x.add_row(['Linear SVM', '0.525', '1.142', '0.691'])
x.add_row(['LGBMClassifier', '0.593', '0.830', '0.48'])
x.add_row(['LSTM Model', '-', '-', '-'])
print(x)
By the above results we can see that the LGBMClassifier performs better compared to the Linear SVM.